Import Library¶

In [ ]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, VisualRepresentation
from bertopic.backend import MultiModalBackend
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
import hdbscan
import base64
from io import BytesIO
from IPython.display import HTML
from PIL import Image
import matplotlib.pyplot as plt
import math

Demonstration of Multimodal Topic Modeling¶

This visualization demonstrates the application of topic modeling to a dataset that, while not sourced from the dark web, covers themes related to weapons, drugs, and robbery.
The images used in this demonstration do not derive from the dark web.
The dataset used comes from: Roboflow's Drug Detection Project.
By associating these images with their corresponding topic labels from the dark web, this approach allows us to expand the analysis and enrich it with both textual and visual information.
This multimodal model offers a deeper understanding of complex datasets by integrating diverse data types and can be extended to further analyze and interpret data in various contexts.
The combination of text and imagery provides a robust framework for exploring and categorizing content in a more comprehensive manner.

1° Visual Model Baseline trained on 3k images¶

Clustering Approach¶

  • Parameter Setting:
    • Embedding Model for images: clip-ViT-B-32
    • Representation Model: KeyBERTInspired, VisualRepresentation
    • Visual Model: vit-gpt2-image-captioning with 300 nr_repr_images
    • Count Vectorizer
  • c-TF-IDF (class-based TF-IDF)
    • UMAP: 150 neighbors, 10 components
    • HDBSCAN: 50 min cluster size

Clustering Results¶

  • Clusters Retrieved: 4 which include:
    1. Gun
    2. Drug
    3. People with gun / drug
    4. People

Process Datasets¶

In [2]:
# Dataset directory layout (Roboflow export): each split folder contains the
# image files plus an `_annotations.csv` describing them.
TRAIN_FOLDER = 'Datasets/RawData/train'
TEST_FOLDER = 'Datasets/RawData/test'
VALID_FOLDER = 'Datasets/RawData/valid'
In [3]:
def load_image_paths_and_annotations(image_folder: str, annotation_file: str) -> tuple:
    """
    Load image paths and per-image class labels from a Roboflow-style annotation CSV.

    Rows whose image file does not exist on disk are reported and skipped.

    :param image_folder: Folder containing the images.
    :param annotation_file: CSV file with at least `filename` and `class` columns.
    :return: Tuple of three items: (image_paths, labels, annotations), where
             image_paths is a list of existing image file paths, labels the
             matching `class` values, and annotations the full DataFrame as
             read from the CSV.
    """
    annotations = pd.read_csv(annotation_file)
    image_paths = []
    labels = []

    for _, row in tqdm(annotations.iterrows(), total=annotations.shape[0], desc='Loading image paths'):
        # Defined before the try so the except handler can always reference it
        # (in the original, a failure in row['filename'] itself would raise
        # UnboundLocalError inside the handler).
        img_filename = None
        try:
            img_filename = row['filename']
            img_path = os.path.join(image_folder, img_filename)
            if os.path.exists(img_path):
                image_paths.append(img_path)
                labels.append(row['class'])
            else:
                print(f'Image file {img_filename} does not exist at {img_path}')
        except Exception as e:
            # Best-effort loading: report and skip malformed rows, keep going.
            print(f'Error processing image {img_filename}: {e}')
            continue

    return image_paths, labels, annotations

def image_base64(im) -> str:
    """
    Convert an image to a base64-encoded JPEG string.

    :param im: A PIL image object, or a path (str) to an image file on disk.
    :return: Base64 encoding of the JPEG-serialized image.
    """
    if isinstance(im, str):
        # BUG FIX: the original called `get_thumbnail`, which is not defined
        # anywhere in this notebook, so the str branch raised NameError.
        # Load the file and downscale it so the embedded HTML stays small.
        img = Image.open(im)
        img.thumbnail((150, 150))
        im = img.convert('RGB')  # JPEG cannot encode RGBA/P mode images
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im: str) -> str:
    """
    Render an image as an inline HTML <img> tag for notebook display.

    :param im: Path to the image (anything accepted by image_base64).
    :return: HTML image tag with the image embedded as base64 JPEG data.
    """
    encoded = image_base64(im)
    return '<img src="data:image/jpeg;base64,{}">'.format(encoded)
In [ ]:
# Build (paths, labels, annotations) for each split. Rows whose image file is
# missing on disk are skipped by the loader, so lists may be shorter than the CSV.
test_images, test_labels, test_annotations = load_image_paths_and_annotations(TEST_FOLDER, os.path.join(TEST_FOLDER, '_annotations.csv'))
val_images, val_labels, val_annotations = load_image_paths_and_annotations(VALID_FOLDER, os.path.join(VALID_FOLDER, '_annotations.csv'))
train_images, train_labels, train_annotations = load_image_paths_and_annotations(TRAIN_FOLDER, os.path.join(TRAIN_FOLDER, '_annotations.csv'))
In [5]:
# Fit the topic model on train + test; the validation split is held out and
# used for prediction in the final section.
images = train_images + test_images
labels = train_labels + test_labels

Prepare Model¶

Pre-Compute embeddings¶

In [ ]:
# One-time precompute (left disabled): embed labels+images and images-only with
# CLIP, then cache to .npz so later runs load from disk instead of re-encoding
# ~3k images. The next cell reads these cached files.
'''
embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)
embedd = embedding_model.embed(documents=labels, images=images, verbose=True)
embedd_only_images = embedding_model.embed_images(images=images, verbose=True)
np.savez_compressed('Embeddings/embedding_images.npz', embedd)
np.savez_compressed('Embeddings/embedding_only_images.npz', embedd_only_images)
'''
In [7]:
# Load the cached CLIP embeddings. 'arr_0' is the default key assigned by
# np.savez_compressed when arrays are passed positionally.
with np.load('Embeddings/embedding_only_images.npz') as data1, np.load('Embeddings/embedding_images.npz') as data2:
    embedd_only_images = data1['arr_0']
    embedd = data2['arr_0']

Build BERTopic Model¶

In [ ]:
# Topic-word refinement: re-ranks c-TF-IDF candidate words by embedding similarity.
kw = KeyBERTInspired()
# Bag-of-words over the text labels, dropping English stop words.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Dimensionality reduction before clustering; random_state pinned for reproducibility.
umap_model = UMAP(n_neighbors=150, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Embeddings image models
embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)

# Visual model
# NOTE(review): the parameter summary above says 300 nr_repr_images but this
# cell uses 500 — confirm which configuration produced the reported results.
visual_model = VisualRepresentation(image_to_text_model="nlpconnect/vit-gpt2-image-captioning", nr_samples=20, nr_repr_images=500, image_height=800)

# Two representations per topic: an image collage and KeyBERT-style keywords.
representation_model = {
   "Visual_Aspect":  visual_model,
   "KeyBERTInspired": kw
}

topic_model = BERTopic(
                      min_topic_size=50,
                      top_n_words=5,
                      n_gram_range=(1, 3),
                      representation_model=representation_model,
                      vectorizer_model=vectorizer_model,
                      ctfidf_model=ctfidf_model,
                      embedding_model=embedding_model,
                      umap_model=umap_model,
                      hdbscan_model=hdbscan_model,
                      verbose=True)

# Fit on image-only embeddings; labels are passed as the documents to describe topics.
topics, probs = topic_model.fit_transform(documents=labels, images=images, embeddings=embedd_only_images)

Show Results¶

In [10]:
# Extract dataframe
df = topic_model.get_topic_info().drop(["Representative_Docs", "Name", "Representation"], axis=1)[["Topic", "Count", "KeyBERTInspired", "Visual_Aspect"]]

# Visualize the images
HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))
Out[10]:
Topic Count KeyBERTInspired Visual_Aspect
0 -1 160 [drug, people, , , , , , , , ] No description has been provided for this image
1 0 2162 [gun, , , , , , , , , ] No description has been provided for this image
2 1 317 [drug, , , , , , , , , ] No description has been provided for this image
3 2 280 [people, , , , , , , , , ] No description has been provided for this image
4 3 71 [people, drug, , , , , , , , ] No description has been provided for this image

Model Graphs¶

In [11]:
topic_model.visualize_barchart()

image-2.png

In [14]:
topic_model.visualize_topics()

image-2.png

In [15]:
topic_model.visualize_heatmap()

image-2.png

In [16]:
topic_model.visualize_hierarchy()

image-2.png

In [17]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embedd)
In [19]:
topic_model.visualize_documents(labels, reduced_embeddings=reduced_embeddings, hide_document_hover=True)

image-3.png

In [ ]:
topic_model.visualize_document_datamap(labels, embeddings=embedd)

4DataMap-3.png

In [21]:
topic_model.visualize_term_rank(log_scale=True)

image-2.png

Save Model¶

In [22]:
topic_model.save("Models/topic_visual_model_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)
In [ ]:
# Publish the model to the Hugging Face Hub. Requires prior authentication
# (huggingface-cli login or HF_TOKEN env var) — never hardcode the token here.
topic_model.push_to_hf_hub(
    repo_id="D0men1c0/ISSR_Visual_Model",
    save_embedding_model=embedding_model,
    save_ctfidf=True
)

Predict¶

In [28]:
topic_model = BERTopic.load('D0men1c0/ISSR_Visual_Model', embedding_model='clip-ViT-B-32')
In [29]:
topic_model.get_topic_info()
Out[29]:
Topic Count Name Representation KeyBERTInspired Visual_Aspect Representative_Docs
0 -1 160 -1_drug_people_gun_ [drug, people, gun, , ] [drug, people, , , , , , , , ] <PIL.JpegImagePlugin.JpegImageFile image mode=... NaN
1 0 2162 0_gun_people_drug_ [gun, people, drug, , ] [gun, , , , , , , , , ] <PIL.JpegImagePlugin.JpegImageFile image mode=... NaN
2 1 317 1_drug_gun__ [drug, gun, , , ] [drug, , , , , , , , , ] <PIL.JpegImagePlugin.JpegImageFile image mode=... NaN
3 2 280 2_people_gun__ [people, gun, , , ] [people, , , , , , , , , ] <PIL.JpegImagePlugin.JpegImageFile image mode=... NaN
4 3 71 3_people_gun_drug_ [people, gun, drug, , ] [people, drug, , , , , , , , ] <PIL.JpegImagePlugin.JpegImageFile image mode=... NaN
In [ ]:
# Assign each held-out validation image to one of the trained topics, then
# collect each prediction's topic-info row into a single DataFrame
# (one row per validation image, aligned with val_images order).
topic, _ = topic_model.transform(val_labels, images=val_images)
all_topic_info = [topic_model.get_topic_info(t) for t in topic]
all_prediction_info = pd.concat(all_topic_info, ignore_index=True)
In [31]:
# Preview up to `sample_images` validation images in a 4-column grid, titling
# each with its predicted topic id and top KeyBERT keyword.
sample_images = 100
n_images = min(sample_images, len(val_images))
n_cols = 4
n_rows = math.ceil(n_images / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 3))
axes = axes.flatten()  # 1-D view so panels can be indexed sequentially

paired = zip(val_images[:n_images], all_prediction_info.iterrows())
for idx, (img_path, (_, info)) in enumerate(paired):
    panel = axes[idx]
    panel.imshow(Image.open(img_path))
    panel.axis('off')
    panel.set_title(f"Topic {info['Topic']}: {info['KeyBERTInspired'][0]}")

# Blank out any leftover panels in the final row
for panel in axes[n_images:]:
    panel.axis('off')

plt.tight_layout()
plt.show()
No description has been provided for this image